{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Description:\n",
    "这是一个使用协同过滤算法的小案例， 实现一下博客上的那个用户评分的案例， 任务是预测某个用户对于某个物品的打分情况，具体可以参考给出的博客链接。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-20T11:35:38.597688Z",
     "start_time": "2020-08-20T11:35:37.382263Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "import warnings \n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-20T11:35:40.070009Z",
     "start_time": "2020-08-20T11:35:40.064066Z"
    }
   },
   "outputs": [],
   "source": [
    "# 定义数据集， 也就是那个表格， 注意这里我们采用字典存放数据， 因为实际情况中数据是非常稀疏的， 很少有情况是现在这样\n",
    "def loadData():\n",
    "    items={'A': {1: 5, 2: 3, 3: 4, 4: 3, 5: 1},\n",
    "           'B': {1: 3, 2: 1, 3: 3, 4: 3, 5: 5},\n",
    "           'C': {1: 4, 2: 2, 3: 4, 4: 1, 5: 5},\n",
    "           'D': {1: 4, 2: 3, 3: 3, 4: 5, 5: 2},\n",
    "           'E': {2: 3, 3: 5, 4: 4, 5: 1}\n",
    "          }\n",
    "    users={1: {'A': 5, 'B': 3, 'C': 4, 'D': 4},\n",
    "           2: {'A': 3, 'B': 1, 'C': 2, 'D': 3, 'E': 3},\n",
    "           3: {'A': 4, 'B': 3, 'C': 4, 'D': 3, 'E': 5},\n",
    "           4: {'A': 3, 'B': 3, 'C': 1, 'D': 5, 'E': 4},\n",
    "           5: {'A': 1, 'B': 5, 'C': 5, 'D': 2, 'E': 1}\n",
    "          }\n",
    "    return items,users"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-20T11:35:41.191078Z",
     "start_time": "2020-08-20T11:35:41.185053Z"
    }
   },
   "outputs": [],
   "source": [
    "items, users = loadData()\n",
    "item_df = pd.DataFrame(items).T\n",
    "user_df = pd.DataFrame(users).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-20T11:50:01.954452Z",
     "start_time": "2020-08-20T11:50:01.944477Z"
    }
   },
   "outputs": [],
   "source": [
    "\"\"\"计算用户相似性矩阵\"\"\"\n",
    "similarity_matrix = pd.DataFrame(np.zeros((len(users), len(users))), index=[1, 2, 3, 4, 5], columns=[1, 2, 3, 4, 5])\n",
    "\n",
    "# 遍历每条用户-物品评分数据\n",
    "for userID in users:\n",
    "    for otheruserId in users:\n",
    "        vec_user = []\n",
    "        vec_otheruser = []\n",
    "        if userID != otheruserId:\n",
    "            for itemId in items:   # 遍历物品-用户评分数据\n",
    "                itemRatings = items[itemId]        # 这也是个字典  每条数据为所有用户对当前物品的评分\n",
    "                if userID in itemRatings and otheruserId in itemRatings:  # 说明两个用户都对该物品评过分\n",
    "                    vec_user.append(itemRatings[userID])\n",
    "                    vec_otheruser.append(itemRatings[otheruserId])\n",
    "            # 这里可以获得相似性矩阵(共现矩阵)\n",
    "            similarity_matrix[userID][otheruserId] = np.corrcoef(np.array(vec_user), np.array(vec_otheruser))[0][1]\n",
    "            #similarity_matrix[userID][otheruserId] = cosine_similarity(np.array(vec_user), np.array(vec_otheruser))[0][1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-20T12:08:43.001297Z",
     "start_time": "2020-08-20T12:08:42.993359Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.852803</td>\n",
       "      <td>0.707107</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.792118</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.852803</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.467707</td>\n",
       "      <td>0.489956</td>\n",
       "      <td>-0.900149</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.707107</td>\n",
       "      <td>0.467707</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.161165</td>\n",
       "      <td>-0.466569</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.489956</td>\n",
       "      <td>-0.161165</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.641503</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>-0.792118</td>\n",
       "      <td>-0.900149</td>\n",
       "      <td>-0.466569</td>\n",
       "      <td>-0.641503</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          1         2         3         4         5\n",
       "1  0.000000  0.852803  0.707107  0.000000 -0.792118\n",
       "2  0.852803  0.000000  0.467707  0.489956 -0.900149\n",
       "3  0.707107  0.467707  0.000000 -0.161165 -0.466569\n",
       "4  0.000000  0.489956 -0.161165  0.000000 -0.641503\n",
       "5 -0.792118 -0.900149 -0.466569 -0.641503  0.000000"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "similarity_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-20T11:51:52.388278Z",
     "start_time": "2020-08-20T11:51:52.383237Z"
    }
   },
   "outputs": [],
   "source": [
    "\"\"\"计算前n个相似的用户\"\"\"\n",
    "n = 2\n",
    "similarity_users = similarity_matrix[1].sort_values(ascending=False)[:n].index.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-20T11:53:56.299648Z",
     "start_time": "2020-08-20T11:53:56.286683Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "用户Alice对物品5的打分:  4.871979899370592\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "      <th>E</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.87198</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>5.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>4.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.00000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     A    B    C    D        E\n",
       "1  5.0  3.0  4.0  4.0  4.87198\n",
       "2  3.0  1.0  2.0  3.0  3.00000\n",
       "3  4.0  3.0  4.0  3.0  5.00000\n",
       "4  3.0  3.0  1.0  5.0  4.00000\n",
       "5  1.0  5.0  5.0  2.0  1.00000"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"计算最终得分\"\"\"\n",
    "base_score = np.mean(np.array([value for value in users[1].values()]))\n",
    "weighted_scores = 0.\n",
    "corr_values_sum = 0.\n",
    "for user in similarity_users:  # [2, 3]\n",
    "    corr_value = similarity_matrix[1][user]            # 两个用户之间的相似性\n",
    "    mean_user_score = np.mean(np.array([value for value in users[user].values()]))    # 每个用户的打分平均值\n",
    "    weighted_scores += corr_value * (users[user]['E']-mean_user_score)      # 加权分数\n",
    "    corr_values_sum += corr_value\n",
    "final_scores = base_score + weighted_scores / corr_values_sum\n",
    "print('用户Alice对物品5的打分: ', final_scores)\n",
    "user_df.loc[1]['E'] = final_scores\n",
    "user_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-20T12:21:02.662279Z",
     "start_time": "2020-08-20T12:21:02.651725Z"
    }
   },
   "outputs": [],
   "source": [
    "\"\"\"计算物品的相似矩阵\"\"\"\n",
    "similarity_matrix = pd.DataFrame(np.ones((len(items), len(items))), index=['A', 'B', 'C', 'D', 'E'], columns=['A', 'B', 'C', 'D', 'E'])\n",
    "\n",
    "# 遍历每条物品-用户评分数据\n",
    "for itemId in items:\n",
    "    for otheritemId in items:\n",
    "        vec_item = []         # 定义列表， 保存当前两个物品的向量值\n",
    "        vec_otheritem = []\n",
    "        #userRagingPairCount = 0     # 两件物品均评过分的用户数\n",
    "        if itemId != otheritemId:    # 物品不同\n",
    "            for userId in users:    # 遍历用户-物品评分数据\n",
    "                userRatings = users[userId]    # 每条数据为该用户对所有物品的评分， 这也是个字典\n",
    "                \n",
    "                if itemId in userRatings and otheritemId in userRatings:   # 用户对这两个物品都评过分\n",
    "                    #userRagingPairCount += 1\n",
    "                    vec_item.append(userRatings[itemId])\n",
    "                    vec_otheritem.append(userRatings[otheritemId])\n",
    "            \n",
    "            # 这里可以获得相似性矩阵(共现矩阵)\n",
    "            similarity_matrix[itemId][otheritemId] = np.corrcoef(np.array(vec_item), np.array(vec_otheritem))[0][1]\n",
    "            #similarity_matrix[itemId][otheritemId] = cosine_similarity(np.array(vec_item), np.array(vec_otheritem))[0][1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-20T12:21:10.364699Z",
     "start_time": "2020-08-20T12:21:10.357658Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "      <th>E</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.476731</td>\n",
       "      <td>-0.123091</td>\n",
       "      <td>0.532181</td>\n",
       "      <td>0.969458</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>B</th>\n",
       "      <td>-0.476731</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.645497</td>\n",
       "      <td>-0.310087</td>\n",
       "      <td>-0.478091</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C</th>\n",
       "      <td>-0.123091</td>\n",
       "      <td>0.645497</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.720577</td>\n",
       "      <td>-0.427618</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>D</th>\n",
       "      <td>0.532181</td>\n",
       "      <td>-0.310087</td>\n",
       "      <td>-0.720577</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.581675</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>E</th>\n",
       "      <td>0.969458</td>\n",
       "      <td>-0.478091</td>\n",
       "      <td>-0.427618</td>\n",
       "      <td>0.581675</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          A         B         C         D         E\n",
       "A  1.000000 -0.476731 -0.123091  0.532181  0.969458\n",
       "B -0.476731  1.000000  0.645497 -0.310087 -0.478091\n",
       "C -0.123091  0.645497  1.000000 -0.720577 -0.427618\n",
       "D  0.532181 -0.310087 -0.720577  1.000000  0.581675\n",
       "E  0.969458 -0.478091 -0.427618  0.581675  1.000000"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "similarity_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-20T11:37:17.269269Z",
     "start_time": "2020-08-20T11:37:17.264282Z"
    }
   },
   "outputs": [],
   "source": [
    "\"\"\"得到与物品5相似的前n个物品\"\"\"\n",
    "n = 2\n",
    "similarity_items = similarity_matrix['E'].sort_values(ascending=False)[:n].index.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-20T11:37:56.982844Z",
     "start_time": "2020-08-20T11:37:56.970349Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "用户Alice对物品5的打分:  4.6\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "      <th>E</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     A    B    C    D    E\n",
       "1  5.0  3.0  4.0  4.0  4.6\n",
       "2  3.0  1.0  2.0  3.0  3.0\n",
       "3  4.0  3.0  4.0  3.0  5.0\n",
       "4  3.0  3.0  1.0  5.0  4.0\n",
       "5  1.0  5.0  5.0  2.0  1.0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"计算最终得分\"\"\"\n",
    "base_score = np.mean(np.array([value for value in items['E'].values()]))\n",
    "weighted_scores = 0.\n",
    "corr_values_sum = 0.\n",
    "for item in similarity_items:  # ['A', 'D']\n",
    "    corr_value = similarity_matrix['E'][item]            # 两个物品之间的相似性\n",
    "    mean_item_score = np.mean(np.array([value for value in items[item].values()]))    # 每个物品的打分平均值\n",
    "    weighted_scores += corr_value * (users[1][item]-mean_item_score)      # 加权分数\n",
    "    corr_values_sum += corr_value\n",
    "final_scores = base_score + weighted_scores / corr_values_sum\n",
    "print('用户Alice对物品5的打分: ', final_scores)\n",
    "user_df.loc[1]['E'] = final_scores\n",
    "user_df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
